Guardian News Sentiment Analysis¶

Importing Required Libraries¶

In [1]:
import warnings

import requests
import pandas as pd
from pandas import json_normalize

import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import cufflinks as cf
import plotly.graph_objs as go
from plotly.offline import iplot

# nltk.download('stopwords')  # uncomment on first run
warnings.filterwarnings("ignore")
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
%matplotlib inline

Setting the API Key and Creating Functions to Extract Data from the Guardian API into a Pandas Dataframe¶

In [2]:
with open("creds_guardian.txt") as f:
    apikey = f.read().strip()

def search_guardian_articles(api_key, search_term='', page=1, page_size=1000, format_='json'):
    '''Retrieves metadata of articles matching the search term'''
    # requests URL-encodes the query parameters itself, so the search
    # term can be passed through as-is
    url = 'https://content.guardianapis.com/search'
    params = {'api-key': api_key,
              'format': format_,
              'page': page,
              'page-size': page_size,
              'q': search_term}
    response = requests.get(url, params=params)
    return response.json()
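
The helper can be sanity-checked before paging through results: the Guardian API wraps its payload under a top-level response key, whose status and total fields make it easy to confirm the key and query work. A minimal, unexecuted sketch:

In [ ]:
# Sketch: inspect the response envelope before bulk extraction.
# The Guardian API nests its payload under a top-level 'response' key.
resp = search_guardian_articles(api_key=apikey, search_term='Ukraine Russia War', page_size=1)
print(resp['response']['status'])  # 'ok' on success
print(resp['response']['total'])   # total number of matching articles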
In [3]:
def guardian_articles_dataframe(api_key, search_term='', number_of_records=1000):
    '''Returns a dataframe with article information from the Guardian API

    var:
        search_term: Query string passed to the Guardian API to search the server database

        api_key: Key required to access the Guardian API. Available for free from the Guardian developer website

        number_of_records: Number of records to return in the dataframe'''
    # Collect each page of results; DataFrame.append is deprecated in
    # recent pandas versions, so concatenate the pages at the end instead
    frames = []

    # Iterate through a series of API calls, 200 records per page
    for i in range(1, int(number_of_records / 200) + 1):
        try:
            data = json_normalize(search_guardian_articles(api_key=api_key, search_term=search_term, page_size=200, page=i)['response']['results'])
            frames.append(data)
        except (KeyError, requests.RequestException):
            break

    if not frames:
        return pd.DataFrame()

    # Concatenate the pages, reset the index and return the dataframe
    df = pd.concat(frames).reset_index(drop=True)
    return df[:number_of_records]
In [4]:
# Search the Guardian API with a term to get our dataset
term = 'Ukraine Russia War'

df = guardian_articles_dataframe(api_key=apikey, search_term=term, number_of_records=1000)
df.head()
Out[4]:
id type sectionId sectionName webPublicationDate webTitle webUrl apiUrl isHosted pillarId pillarName
0 world/2022/dec/02/finland-pm-sanna-marin-says-... article world World news 2022-12-02T06:23:23Z Finland PM Sanna Marin says Europe is ‘not str... https://www.theguardian.com/world/2022/dec/02/... https://content.guardianapis.com/world/2022/de... False pillar/news News
1 world/live/2023/feb/05/russia-ukraine-war-situ... liveblog world World news 2023-02-05T18:07:46Z Russia-Ukraine war live: Ukraine ‘expects poss... https://www.theguardian.com/world/live/2023/fe... https://content.guardianapis.com/world/live/20... False pillar/news News
2 world/live/2023/jan/28/russia-ukraine-war-zele... liveblog world World news 2023-01-28T17:56:20Z Russia-Ukraine war live: Ukraine struggling to... https://www.theguardian.com/world/live/2023/ja... https://content.guardianapis.com/world/live/20... False pillar/news News
3 world/live/2022/dec/26/russia-ukraine-war-live... liveblog world World news 2022-12-26T19:52:39Z Russia-Ukraine war live: Ukraine aiming for pe... https://www.theguardian.com/world/live/2022/de... https://content.guardianapis.com/world/live/20... False pillar/news News
4 world/live/2023/jan/11/russia-ukraine-war-live... liveblog world World news 2023-01-11T19:01:42Z Russia-Ukraine war: Putin replaces general in ... https://www.theguardian.com/world/live/2023/ja... https://content.guardianapis.com/world/live/20... False pillar/news News

Cleaning the Dataset¶

In [5]:
# Keep only the columns needed for the analysis
df = df.filter(items=['sectionName','webTitle','webUrl','pillarName'])
In [6]:
# Extract all rows with pillarName == "News"
news = df['pillarName'] == "News"
In [7]:
df = df[news]
In [10]:
# Remove English stop words from each headline
stop_words = stopwords.words('english')
df['clean_title'] = df['webTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
In [11]:
# NLTK's stop word list is lowercase, so capitalised tokens such as 'At'
# survive the first pass; add those (and other noise tokens) manually
stop_words = stopwords.words('english') + ['At', 'v', '3']
# Apply the same cleaning as above, but assign to a new column
# so the differences can be compared
df['clean_title_v2'] = df['webTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
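
Rather than hand-curating capitalised stop words, the membership test can be made case-insensitive. A sketch of that variant (clean_title_ci is an illustrative column name, not one used elsewhere in this notebook):

In [ ]:
# Sketch: lowercase each token before the membership test so capitalised
# stop words such as 'At' are caught without hand-curated additions.
stop_set = set(stopwords.words('english'))
df['clean_title_ci'] = df['webTitle'].apply(
    lambda x: ' '.join(w for w in x.split() if w.lower() not in stop_set))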
In [12]:
# Use regex to strip leftover HTML tags and entities from the titles
def preprocess(webTitle):
    webTitle = webTitle.str.replace(r'<br/>', '', regex=True)
    webTitle = webTitle.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    webTitle = webTitle.str.replace('&amp', '', regex=False)
    webTitle = webTitle.str.replace('&gt', '', regex=False)
    webTitle = webTitle.str.replace('&lt', '', regex=False)
    webTitle = webTitle.str.replace('\xa0', ' ', regex=False)
    return webTitle
In [13]:
df['clean_title_v2'] = preprocess(df['clean_title_v2'])
df['polarity'] = df['webTitle'].map(lambda text: TextBlob(text).sentiment.polarity)
df['webTitle_len'] = df['webTitle'].astype(str).apply(len)
df['word_count'] = df['webTitle'].apply(lambda x: len(str(x).split()))
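
TextBlob's polarity is a float in [-1, 1]. The cells below group headlines by its sign, which can be made explicit with a labelled column (a sketch; sentiment_label is an illustrative name):

In [ ]:
# Sketch: bucket polarity into the discrete labels used below
# (negative < 0, neutral == 0, positive > 0).
def polarity_label(p):
    if p > 0:
        return 'positive'
    if p < 0:
        return 'negative'
    return 'neutral'

df['sentiment_label'] = df['polarity'].apply(polarity_label)
df['sentiment_label'].value_counts()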
In [14]:
print('5 random headlines with positive sentiment polarity: \n')
cl = df.loc[df.polarity > 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with positive sentiment polarity: 

Kherson fell quickly, Ukraine’s progress east Dnipro harder
Russia-Ukraine war latest: know day 177 invasion
Russia-Ukraine war latest: know day 185 invasion
‘Ukraine definitely win’ says president visit Mykolaiv
Russia-Ukraine war latest: know day 202 invasion
In [15]:
print('5 random headlines with neutral (zero) sentiment polarity: \n')
cl = df.loc[df.polarity == 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with neutral (zero) sentiment polarity: 

Russia-Ukraine war: Russia accused demolishing Mariupol theatre ‘to hide war crimes’ – happened
Russia bombs Kharkiv Ukraine claims ‘tactical successes’
Russia-Ukraine war glance: know day 285 invasion
Russia-Ukraine war: know day 151 invasion
‘It’s madness’: Ukraine holds breath Putin turns nuclear plant frontline
In [16]:
print('5 random headlines with negative sentiment polarity: \n')
cl = df.loc[df.polarity < 0, ['clean_title_v2']].sample(5).values
for c in cl:
    print(c[0])
5 random headlines with negative sentiment polarity: 

Ukrainian adviser quits claims Russian missile killed dozens
Saudi foreign minister defends role securing Ukraine prisoner swaps
Hundreds civilians trapped Soledar amid fierce fighting, Ukraine says
Global carbon emissions forecast cut due Ukraine war Biden, says BP
Why west risks condemning Ukraine slow strangulation
In [17]:
# Check for missing values
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing
Out[17]:
sectionName       0.0%
webTitle          0.0%
webUrl            0.0%
pillarName        0.0%
clean_title       0.0%
clean_title_v2    0.0%
polarity          0.0%
webTitle_len      0.0%
word_count        0.0%
dtype: object
In [18]:
# Check the percentage of duplicate rows
dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
Out[18]:
'2.41%'
In [19]:
# Drop duplicate rows
df = df.drop_duplicates()
In [20]:
# Verify the duplicates were removed
dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
Out[20]:
'0.0%'
In [21]:
# Final cleaned dataframe
df.head()
Out[21]:
sectionName webTitle webUrl pillarName clean_title clean_title_v2 polarity webTitle_len word_count
0 World news Finland PM Sanna Marin says Europe is ‘not str... https://www.theguardian.com/world/2022/dec/02/... News Finland PM Sanna Marin says Europe ‘not strong... Finland PM Sanna Marin says Europe ‘not strong... -0.108333 72 13
1 World news Russia-Ukraine war live: Ukraine ‘expects poss... https://www.theguardian.com/world/live/2023/fe... News Russia-Ukraine war live: Ukraine ‘expects poss... Russia-Ukraine war live: Ukraine ‘expects poss... 0.049716 103 15
2 World news Russia-Ukraine war live: Ukraine struggling to... https://www.theguardian.com/world/live/2023/ja... News Russia-Ukraine war live: Ukraine struggling ho... Russia-Ukraine war live: Ukraine struggling ho... 0.018182 98 15
3 World news Russia-Ukraine war live: Ukraine aiming for pe... https://www.theguardian.com/world/live/2022/de... News Russia-Ukraine war live: Ukraine aiming peace ... Russia-Ukraine war live: Ukraine aiming peace ... 0.005682 91 13
4 World news Russia-Ukraine war: Putin replaces general in ... https://www.theguardian.com/world/live/2023/ja... News Russia-Ukraine war: Putin replaces general cha... Russia-Ukraine war: Putin replaces general cha... 0.025000 98 16

Creating the Australia and UK News Dataframes¶

In [22]:
# Extract all news about the war from the Australia section into a dataframe
df_aus = df[df.sectionName == 'Australia news']
In [23]:
df_aus.head()
Out[23]:
sectionName webTitle webUrl pillarName clean_title clean_title_v2 polarity webTitle_len word_count
133 Australia news Australian man who died fighting in Ukraine re... https://www.theguardian.com/australia-news/202... News Australian man died fighting Ukraine remembere... Australian man died fighting Ukraine remembere... 0.000000 79 12
204 Australia news Australia and France agree arms deal for Ukrai... https://www.theguardian.com/australia-news/202... News Australia France agree arms deal Ukraine talks... Australia France agree arms deal Ukraine talks... 0.000000 84 15
207 Australia news Family mourns death of ‘treasured and loved’ A... https://www.theguardian.com/australia-news/202... News Family mourns death ‘treasured loved’ Australi... Family mourns death ‘treasured loved’ Australi... 0.350000 70 11
259 Australia news Defence minister hails ‘heroic’ Ukraine counte... https://www.theguardian.com/australia-news/liv... News Defence minister hails ‘heroic’ Ukraine counte... Defence minister hails ‘heroic’ Ukraine counte... 0.700000 73 10
273 Australia news Morning Mail: ‘Phantom’ carbon credits reveale... https://www.theguardian.com/australia-news/202... News Morning Mail: ‘Phantom’ carbon credits reveale... Morning Mail: ‘Phantom’ carbon credits reveale... 0.285714 107 15
In [24]:
# Extract all news about the war from the UK section into a dataframe
df_uk = df[df.sectionName == 'UK news']
In [25]:
df_uk.head()
Out[25]:
sectionName webTitle webUrl pillarName clean_title clean_title_v2 polarity webTitle_len word_count
65 UK news Two missing Britons killed in Ukraine while ev... https://www.theguardian.com/uk-news/2023/jan/2... News Two missing Britons killed Ukraine evacuating ... Two missing Britons killed Ukraine evacuating ... -0.20 76 11
94 UK news UK seeks more German support as it confirms Ch... https://www.theguardian.com/uk-news/2023/jan/1... News UK seeks German support confirms Challenger ta... UK seeks German support confirms Challenger ta... 0.25 72 12
228 UK news Liverpool to host Eurovision song contest on b... https://www.theguardian.com/uk-news/2022/oct/0... News Liverpool host Eurovision song contest behalf ... Liverpool host Eurovision song contest behalf ... 0.00 62 10
334 UK news Migration to UK rises to record 504,000 with U... https://www.theguardian.com/uk-news/2022/nov/2... News Migration UK rises record 504,000 Ukraine Hong... Migration UK rises record 504,000 Ukraine Hong... 0.00 74 13
733 UK news Two more Britons captured in Ukraine could fac... https://www.theguardian.com/uk-news/2022/jul/0... News Two Britons captured Ukraine could face death ... Two Britons captured Ukraine could face death ... 0.50 61 10

Storing All Dataframes as JSON Files¶

In [26]:
df.to_json('Guardian.json') 
df_aus.to_json('Guardian_Aus.json')
df_uk.to_json('Guardian_UK.json') 
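
To confirm the files round-trip cleanly, one can be reloaded with pd.read_json and compared against the in-memory dataframe (an unexecuted sketch):

In [ ]:
# Sketch: reload a saved file and confirm the shape survived the round trip.
df_check = pd.read_json('Guardian.json')
assert df_check.shape == df.shape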

Creating Word Clouds for the Australia and UK Dataframes¶

In [27]:
text = df_aus['clean_title_v2'].values

# Join the titles into a single string; calling str() on the array
# would leak brackets, quotes and truncation dots into the cloud
wordcloud = WordCloud().generate(' '.join(text))

figure(figsize=(12, 8), dpi=80)
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Australia News Sentiments Word Cloud")
plt.show()
In [28]:
text = df_uk['clean_title_v2'].values

wordcloud = WordCloud().generate(' '.join(text))

figure(figsize=(12, 8), dpi=80)
plt.imshow(wordcloud)
plt.title("UK News Sentiments Word Cloud")
plt.axis("off")
plt.show()
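
The WordCloud constructor also accepts a stopwords parameter, so the STOPWORDS set imported above could filter the cloud even when it is built from the raw, uncleaned titles (sketch):

In [ ]:
# Sketch: let wordcloud do the stop word filtering itself via its
# bundled STOPWORDS set, using the raw titles as input.
wc = WordCloud(stopwords=STOPWORDS, width=800, height=500).generate(' '.join(df_uk['webTitle']))
figure(figsize=(12, 8), dpi=80)
plt.imshow(wc)
plt.axis("off")
plt.show()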

Comparing the Sentiments of UK and Australia News¶

In [29]:
# Creating histogram
df_aus['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution in Australia')
In [31]:
# Creating histogram
df_uk['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution in UK')
In [32]:
# Creating histogram
df_aus['webTitle_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='headline length',
    linecolor='black',
    yTitle='count',
    title='webTitle Length Distribution in Australia')
In [33]:
# Creating histogram
df_uk['webTitle_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='headline length',
    linecolor='black',
    yTitle='count',
    title='webTitle Length Distribution in UK')
In [34]:
# Creating histogram
df_aus['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='webTitle Word Count Distribution in Australia')
In [35]:
# Creating histogram
df_uk['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='webTitle Word Count Distribution in UK')
In [36]:
# Bar chart of the 20 most frequent single words
def get_top_n_words(corpus, n=None):
    '''Return the n most frequent words in the corpus with their counts.'''
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Sum counts over all headlines; vocabulary_ maps each word to its column index
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words(df_aus['clean_title_v2'], 20)
df2 = pd.DataFrame(common_words, columns=['word', 'count'])
df2.groupby('word').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in webTitle in Australia')
In [37]:
# Reuse get_top_n_words for the UK headlines
common_words = get_top_n_words(df_uk['clean_title_v2'], 20)
df2 = pd.DataFrame(common_words, columns=['word', 'count'])
df2.groupby('word').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in webTitle in UK')
In [38]:
# Bar chart of the 20 most frequent bigrams
def get_top_n_bigram(corpus, n=None):
    '''Return the n most frequent bigrams in the corpus with their counts.'''
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df_aus['clean_title_v2'], 20)
df4 = pd.DataFrame(common_words, columns=['bigram', 'count'])
df4.groupby('bigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in webTitle in Australia')
In [39]:
# Reuse get_top_n_bigram for the UK headlines
common_words = get_top_n_bigram(df_uk['clean_title_v2'], 20)
df4 = pd.DataFrame(common_words, columns=['bigram', 'count'])
df4.groupby('bigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in webTitle in UK')
In [40]:
# Bar chart of the 20 most frequent trigrams
def get_top_n_trigram(corpus, n=None):
    '''Return the n most frequent trigrams in the corpus with their counts.'''
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(df_aus['clean_title_v2'], 20)
df6 = pd.DataFrame(common_words, columns=['trigram', 'count'])
df6.groupby('trigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in webTitle in Australia')
In [41]:
# Reuse get_top_n_trigram for the UK headlines
common_words = get_top_n_trigram(df_uk['clean_title_v2'], 20)
df6 = pd.DataFrame(common_words, columns=['trigram', 'count'])
df6.groupby('trigram').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in webTitle in UK')
In [42]:
# Creating Box Plot
y0 = df.loc[df['sectionName'] == 'UK news']['polarity']
y1 = df.loc[df['sectionName'] == 'Australia news']['polarity']
y2 = df.loc[df['sectionName'] == 'World news']['polarity']

trace0 = go.Box(
    y=y0,
    name = 'UK news',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )
)
trace1 = go.Box(
    y=y1,
    name = 'Australia news',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
trace2 = go.Box(
    y=y2,
    name = 'World news',
    marker = dict(
        color = 'rgb(10, 140, 208)',
    )
)
data = [trace0, trace1, trace2]
layout = go.Layout(
    title="Sentiment Polarity of News in Australia, the UK and Around the World"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="Sentiment Polarity of News in Australia, the UK and Around the World")
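
The box plot can also be summarised numerically with a per-section groupby (sketch), which makes the Australia/UK/World comparison in the insights below concrete:

In [ ]:
# Sketch: numeric companion to the box plot, showing the mean, median
# and spread of headline polarity per section.
df.groupby('sectionName')['polarity'].describe()[['mean', '50%', 'std']]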

Insights:¶

The Russo-Ukrainian War is an ongoing war between Russia (together with pro-Russian separatist forces) and Ukraine. It began in February 2014 following the Ukrainian Revolution of Dignity, and initially focused on the status of Crimea and parts of the Donbas, internationally recognized as part of Ukraine. The first eight years of the conflict included the Russian annexation of Crimea (2014) and the war in Donbas (2014–present) between Ukraine and Russian-backed separatists, as well as naval incidents, cyberwarfare, and political tensions. Following a Russian military build-up on the Russia–Ukraine border from late 2021, the conflict expanded significantly when Russia launched a full-scale invasion of Ukraine on 24 February 2022.

News of the Russo-Ukrainian war's escalation spread like wildfire across the world, and different regions have reacted to it in contrasting ways. In this project we compare the sentiment of the coverage published in Australia and the UK. We first extracted the dataset from the Guardian API, ran sentiment analysis on every headline, and classified each as positive, neutral or negative. We then split the dataframe into Australia-specific and UK-specific news about the invasion.

We first identified the keywords used in the headlines of both Australian and UK news, which showed which terms each country's coverage focused on. Based on the word clouds, Australian news appeared more neutral, concentrating on the generic headlines being broadcast around the world, while UK news seemed more concerned about the potential refugee crisis created by the situation and how it might affect the country.

We then examined word frequencies, both for single words and for word pairs, in Australian and UK news. In Australia the term Ukraine was often followed by Russia, war, ceasefire and sanction, suggesting a generic, neutral framing. In the UK, Ukraine more often appeared alongside phrases such as refugee crisis, help Ukraine and imposing sanctions, reflecting a more negative sentiment toward Russia's invasion of Ukraine. This is plausibly because of the refugees expected to arrive in the UK, given its relative proximity and accessibility.

We then compared the sentiment of Australian news with that of UK news and world news. UK headlines skewed negative, consistent with concerns about a looming refugee crisis and its economic impact, whereas Australian headlines skewed mildly positive, with the word frequencies pointing to hopes of a ceasefire. World news sat between the two, neutral to divided, reflecting uncertainty about how the crisis would affect the global economy and hope for a ceasefire to avoid further bloodshed.

Based on this analysis, we can clearly see how sentiment varies between the two countries, and how each country's coverage is shaped by its geographic location and exposure to the conflict's impact.